Figures in supplementary material

This document will allow you to reproduce all supplementary figures.

Set working directory and load all necessary libraries and functions.

# If not installed, install and load the package "here", else: only load the package.
err <- try(library("here", character.only = TRUE), silent = TRUE)
## here() starts at /Users/lgoeminn/Documents/phD/MSqRobHurdlePaper
if (class(err) == 'try-error') {
  install.packages("here", repos = "https://cloud.r-project.org")
  library("here", character.only = TRUE)
}

wd <- here()

# Optional: change the working directory
setwd(wd)

# Load all functions and libraries
source(paste0(wd, "/R/functions.r"))
## Loading required namespace: BiocManager
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## ── Attaching packages ─────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.1
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colMeans, colnames, colSums, dirname, do.call, duplicated,
##     eval, evalq, Filter, Find, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
##     Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: mzR
## Loading required package: Rcpp
## Warning in fun(libname, pkgname): mzR has been built against a different Rcpp version (1.0.0)
## than is installed on your system (1.0.1). This might lead to errors
## when loading mzR. If you encounter such issues, please send a report,
## including the output of sessionInfo() to the Bioc support forum at 
## https://support.bioconductor.org/. For details see also
## https://github.com/sneumann/mzR/wiki/mzR-Rcpp-compiler-linker-issue.
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 3.6.1
## Loading required package: stats4
## Warning: multiple methods tables found for 'lengths'
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: ProtGenerics
## 
## This is MSnbase version 2.8.3 
##   Visit https://lgatto.github.io/MSnbase/ to get started.
## 
## Attaching package: 'MSnbase'
## The following object is masked from 'package:stats':
## 
##     smooth
## The following object is masked from 'package:base':
## 
##     trimws
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: future
## 
## Attaching package: 'future'
## The following object is masked from 'package:survival':
## 
##     cluster
## The following object is masked from 'package:S4Vectors':
## 
##     values
## Loading required package: SummarizedExperiment
## Warning: package 'SummarizedExperiment' was built under R version 3.6.1
## Loading required package: GenomicRanges
## Warning: package 'GenomicRanges' was built under R version 3.6.1
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.6.1
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## Loading required package: GenomeInfoDb
## Warning: no function found corresponding to methods exports from 'XVector'
## for: 'concatenateObjects'
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
## 
##     count
## Loading required package: BiocParallel
## Warning: multiple methods tables found for 'lengths'
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:purrr':
## 
##     simplify
## The following objects are masked from 'package:base':
## 
##     aperm, apply
## 
## Attaching package: 'stageR'
## The following object is masked from 'package:methods':
## 
##     getMethod
source(paste0(wd,"/R/plot_functions.R"))

Give session info for reproducibility.

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] proDA_1.1.2                 msqrobsum_0.0.0.9000       
##  [3] MSqRob_0.7.6                stageR_1.3.29              
##  [5] SummarizedExperiment_1.14.1 DelayedArray_0.6.0         
##  [7] BiocParallel_1.14.2         matrixStats_0.54.0         
##  [9] GenomicRanges_1.36.1        GenomeInfoDb_1.16.0        
## [11] IRanges_2.18.3              edgeR_3.24.3               
## [13] furrr_0.1.0.9002            future_1.13.0              
## [15] readxl_1.3.1                colorspace_1.4-1           
## [17] zoo_1.8-6                   corpcor_1.6.9              
## [19] survival_2.44-1.1           ibb_13.06                  
## [21] lme4_1.1-21                 Matrix_1.2-17              
## [23] limma_3.36.5                MSnbase_2.8.3              
## [25] ProtGenerics_1.12.0         S4Vectors_0.22.1           
## [27] mzR_2.16.1                  Rcpp_1.0.1                 
## [29] Biobase_2.40.0              BiocGenerics_0.26.0        
## [31] forcats_0.4.0               stringr_1.4.0              
## [33] dplyr_0.8.1                 purrr_0.3.2                
## [35] readr_1.3.1                 tidyr_0.8.3                
## [37] tibble_2.1.3                ggplot2_3.1.1              
## [39] tidyverse_1.2.1             MSstats_3.12.2             
## [41] usethis_1.5.0               devtools_2.0.2             
## [43] remotes_2.0.4               preprocessCore_1.42.0      
## [45] here_0.1                   
## 
## loaded via a namespace (and not attached):
##  [1] snow_0.4-3             backports_1.1.4        plyr_1.8.4            
##  [4] lazyeval_0.2.2         splines_3.6.0          listenv_0.7.0         
##  [7] digest_0.6.19          foreach_1.4.4          BiocInstaller_1.30.0  
## [10] htmltools_0.3.6        gdata_2.18.0           magrittr_1.5          
## [13] memoise_1.1.0          doParallel_1.0.14      globals_0.12.4        
## [16] modelr_0.1.4           prettyunits_1.0.2      rvest_0.3.4           
## [19] ggrepel_0.8.1          haven_2.1.0            xfun_0.7              
## [22] callr_3.2.0            crayon_1.3.4           RCurl_1.95-4.12       
## [25] jsonlite_1.6           impute_1.54.0          iterators_1.0.10      
## [28] glue_1.3.1             gtable_0.3.0           zlibbioc_1.26.0       
## [31] XVector_0.20.0         pkgbuild_1.0.3         future.apply_1.3.0    
## [34] scales_1.0.0           vsn_3.48.1             httr_1.4.0            
## [37] gplots_3.0.1.1         pkgconfig_2.0.2        XML_3.98-1.20         
## [40] locfit_1.5-9.1         tidyselect_0.2.5       rlang_0.4.2           
## [43] reshape2_1.4.3         munsell_0.5.0          cellranger_1.1.0      
## [46] tools_3.6.0            cli_1.1.0              generics_0.0.2        
## [49] broom_0.5.2            evaluate_0.14          mzID_1.18.0           
## [52] yaml_2.2.0             processx_3.3.1         knitr_1.23            
## [55] fs_1.3.1               caTools_1.17.1.2       randomForest_4.6-14   
## [58] ncdf4_1.16.1           nlme_3.1-140           xml2_1.2.0            
## [61] compiler_3.6.0         rstudioapi_0.10        testthat_2.1.1        
## [64] affyio_1.50.0          marray_1.58.0          stringi_1.4.3         
## [67] ps_1.3.0               desc_1.2.0             lattice_0.20-38       
## [70] nloptr_1.2.1           pillar_1.4.1           BiocManager_1.30.4    
## [73] MALDIquant_1.19.3      data.table_1.12.2      bitops_1.0-6          
## [76] R6_2.4.0               pcaMethods_1.72.0      affy_1.58.0           
## [79] KernSmooth_2.23-15     sessioninfo_1.1.1      codetools_0.2-16      
## [82] boot_1.3-22            MASS_7.3-51.4          gtools_3.8.1          
## [85] assertthat_0.2.1       pkgload_1.0.2          rprojroot_1.3-2       
## [88] minpack.lm_1.2-1       withr_2.1.2            GenomeInfoDbData_1.1.0
## [91] doSNOW_1.0.16          hms_0.4.2              grid_3.6.0            
## [94] minqa_1.2.4            rmarkdown_1.13         lubridate_1.7.4

Important: make sure you have run the CPTAC and HEART analyses, or load the necessary files as follows:

load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC2.RData"))
load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC_KNN1.RData"))
load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC_Perseus_imp.RData"))
load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC_mixed.RData"))
load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC_RegEM_imp.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_co_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_edgeR_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_beta_binom_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_Hurdle.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_PI_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_KNN1_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_RegEM_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_mixed_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_ProDA_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_ProPCA_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_mixture.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_DEP.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_DEP_noimp.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_MSstats_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_MSstats_noimp_full.RData"))
load(paste0(wd, "/save_files_CPTAC/CPTAC_Perseus_full.RData"))
load(paste0(wd, "/save_files_CPTAC/CPTAC_Perseus_imp_full.RData"))
load(paste0(wd, "/save_files_PXD006675/peptides_HEART2.RData"))
load(paste0(wd, "/save_files_PXD006675/res_HEART_Hurdle.RData"))
load(paste0(wd, "/save_files_PXD006675/overlap_PHM.RData"))

1. Evaluate the imputation: protein level (Supplementary Fig. 1 and 2)

We here show the sample-wise effect of kNN and Perseus imputation at the level of MaxQuant’s LFQ-values (protein level). Blue are the original values, red are the imputed values.

# Import non-imputed data
file.proteinGroups <- paste0(wd,"/datasets/CPTAC/proteinGroups_preprocessed.txt")
exprs_col <- grepEcols(file.proteinGroups, "LFQ.intensity.", split = "\t")
proteinGroups.CPTAC <- readMSnSet2(file.proteinGroups, ecol = exprs_col, fnames = c("Protein.IDs"), sep = "\t")
sampleNames(proteinGroups.CPTAC) <- str_replace(sampleNames(proteinGroups.CPTAC), "LFQ.intensity.", "") %>% make.names

# Import Perseus-imputed data
file.proteinGroups <- paste0(wd,"/datasets/CPTAC/proteinGroups_preprocessed_imputed.txt")
exprs_col <- grepEcols(file.proteinGroups, "LFQ.intensity.", split = "\t")
proteinGroups.CPTAC.Perseus.imp <- readMSnSet2(file.proteinGroups, ecol = exprs_col, fnames = c("Protein.IDs"), sep = "\t")
sampleNames(proteinGroups.CPTAC.Perseus.imp) <- str_replace(sampleNames(proteinGroups.CPTAC.Perseus.imp), "LFQ.intensity.", "") %>% make.names

sample.names <- paste0(sampleNames(proteinGroups.CPTAC) %>% substr(3, 3), sampleNames(proteinGroups.CPTAC) %>% substr(5, 5))

### Impute with KNN-1
proteinGroups.CPTAC.KNN1 <- impute(proteinGroups.CPTAC, method = "knn")
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 700 rows with more than 50 % entries missing;
##  mean imputation used for these rows
cex.main <- 1.2
cex.lab <- 1
cex <- 1
par(mfrow=c(1,1))

# 1. Histograms for the effect of Perseus imputation LFQ-level (Supplementary Fig. 1)

# grDevices::svg(paste0(wd,"/Perseus_protein_sample_imp.svg"), width=60, height=90)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(1,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))

for(i in 1:ncol(exprs(proteinGroups.CPTAC.Perseus.imp))){
p1 <- hist(exprs(proteinGroups.CPTAC.Perseus.imp[,i]), breaks=25, plot = FALSE)
p2 <- hist(exprs(proteinGroups.CPTAC[,i]), breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(12,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("Perseus imputation sample ", sample.names[i]), xlab = "log2(LFQ intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex)  # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(12,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE)  # second histogram
}

# dev.off()

# 2. Histograms for the effect of kNN imputation at LFQ-level (Supplementary Fig. 2)

# grDevices::svg(paste0(wd,"/kNN_protein_sample_imp.svg"), width=60, height=90)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(1,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))

for(i in 1:ncol(exprs(proteinGroups.CPTAC.KNN1))){
p1 <- hist(exprs(proteinGroups.CPTAC.KNN1)[,i], breaks=25, plot = FALSE)
p2 <- hist(exprs(proteinGroups.CPTAC)[,i], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("kNN imputation sample ", sample.names[i]), xlab = "log2(LFQ intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex)  # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE)  # second histogram
}

# dev.off()

2. Evaluate the imputation: peptide level (Supplementary Fig. 3 and 4)

We here show the sample-wise effect of kNN and Perseus imputation at the peptide level. Blue are the original values, red are the imputed values.

# 3. Histograms for the effect of Perseus imputation at peptide-level (Supplementary Fig. 3)

# grDevices::svg(paste0(wd,"/Perseus_sample_imp.svg"), width=60, height=90)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(1,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))

for(i in 1:ncol(exprs(peptides.CPTAC.Perseus.imp))){
p1 <- hist(exprs(peptides.CPTAC.Perseus.imp[,i]), breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2[,i]), breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(12,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("Perseus imputation sample ", sample.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex)  # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(12,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE)  # second histogram
}

# dev.off()

# 4. Histograms for the effect of kNN imputation at peptide-level (Supplementary Fig. 4)

sample.names <- paste0(sampleNames(peptides.CPTAC.KNN1) %>% substr(3, 3), sampleNames(peptides.CPTAC.KNN1) %>% substr(5, 5))

cex.main <- 1.2
cex.lab <- 1
cex <- 1
par(mfrow=c(1,1))

# grDevices::svg(paste0(wd,"/kNN_sample_imp.svg"), width=60, height=90)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(1,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))

for(i in 1:ncol(exprs(peptides.CPTAC.KNN1))){
p1 <- hist(exprs(peptides.CPTAC.KNN1)[,i], breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2)[,i], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("kNN imputation sample ", sample.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex)  # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE)  # second histogram
}

# dev.off()

3. Histograms for the effects of all imputations at peptide-level, grouped per condition (Supplementary Fig. 5)

condition.names <- unique(sampleNames(peptides.CPTAC.KNN1) %>% substr(3, 3))

cex.main <- 1.2
cex.lab <- 1
cex <- 1
par(mfrow=c(1,1))

# grDevices::svg(paste0(wd,"/histograms_all_imputations.svg"), width=60, height=120)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(4,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))

# Perseus imputation
for(i in 1:3){
p1 <- hist(exprs(peptides.CPTAC.Perseus.imp)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC.Perseus.imp)))], breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC2)))], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("Perseus imputation condition ", condition.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex)  # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE)  # second histogram
}

# kNN imputation
for(i in 1:3){
p1 <- hist(exprs(peptides.CPTAC.KNN1)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC.KNN1)))], breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC2)))], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("kNN imputation condition ", condition.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex)  # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE)  # second histogram
}

# Mixed imputation
for(i in 1:3){
p1 <- hist(exprs(peptides.CPTAC.mixed)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC.mixed)))], breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC2)))], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("Mixed imputation condition ", condition.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex)  # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE)  # second histogram
}

# RegEM imputation
for(i in 1:3){
p1 <- hist(exprs(peptides.CPTAC.RegEM.imp)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC.RegEM.imp)))], breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC2)))], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("RegEM imputation condition ", condition.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex)  # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE)  # second histogram
}

# dev.off()

Prepare for the FDR-FTP plots

# Due to different preprocessing methods, some proteins cannot be detected with some methods.

# By default we use the full set, i.e. all protein identifiers over all methods.
# Note that all methods used either the preprocessing of MSqRob, of Perseus o of MSstats.
all.proteins <- unique(unlist(lapply(list(res.CPTAC.Hurdle, CPTAC.Perseus.full, res.CPTAC.MSstats.noimp.full), "[", "protein")))
full.Base <- tibble(protein = rep(all.proteins, 3), 
                    UPS = grepl("ups", protein),
                    contrast = rep(c("condition6B-condition6A",
                  "condition6C-condition6A",
                  "condition6C-condition6B"), each = length(all.proteins)))

# We will also make some figure with the common set, i.e. only the proteins that have been seen in the preprocessing of al methods.
common.proteins <- unique(unlist(lapply(list(res.CPTAC.Hurdle, CPTAC.Perseus.full, res.CPTAC.MSstats.noimp.full), "[", "protein")))


indices <- which(table(c(unique(as.character(res.CPTAC.Hurdle$protein)), unique(as.character(CPTAC.Perseus.full$protein)), unique(as.character(res.CPTAC.MSstats.noimp.full$protein)))) == 3)

common.proteins <- names(table(c(unique(as.character(res.CPTAC.Hurdle$protein)), unique(as.character(CPTAC.Perseus.full$protein)), unique(as.character(res.CPTAC.MSstats.noimp.full$protein)))))[indices]

common.Base <- tibble(protein = rep(common.proteins, 3), 
                    UPS = grepl("ups", protein),
                    contrast = rep(c("condition6B-condition6A",
                  "condition6C-condition6A",
                  "condition6C-condition6B"), each = length(common.proteins)))

4. Make the FDR-FTP plots for the imputation-based methods (Supplementary Fig. 6)

# FDR-FTP plot for all imputation-based methods

res.list <- list(res.CPTAC.full,
                 res.CPTAC.KNN1.full,
                 res.CPTAC.PI.full,
                 res.CPTAC.RegEM.full,
                 res.CPTAC.mixed.full,
                 res.CPTAC.Hurdle
)

res.list <- lapply(res.list, function(x){
    left_join(full.Base, x)
}
  )
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
contrast.vec <- c("condition6B-condition6A",
                  "condition6C-condition6A",
                  "condition6C-condition6B")

names(contrast.vec) <- c("Condition 6B vs. 6A",
                  "Condition 6C vs. 6A",
                  "Condition 6C vs. 6B")

colors <- c("#FF681E", "#88FF9C", "blue", "black", "brown", "#5A2A82")

sort.list <- list(c("qvalue", "pvalue", "t", "logFC"),
                  c("qvalue", "pvalue", "t", "logFC"),
                  c("qvalue", "pvalue", "t", "logFC"),
                  c("qvalue", "pvalue", "t", "logFC"),
                  c("qvalue", "pvalue", "t", "logFC"),
                  c("qchisq", "pchisq", "chisq", NA))

PlotFDPTPR(res.list, contrast.vec, colors, sort.list, TPcol = c("UPS"), plotSVG = FALSE) # TRUE

5. Correlation between intensities and counts (Supplementary Fig. 7)

# Make the plots

# BA, CB, CA
upratio <- c(1.565597, 1.571906, 3.137504)

# 1. Top: condition 6B vs. 6A

TP.BvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6B-condition6A" & UPS)

FP.BvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6B-condition6A" & !UPS)

scatterHist(x = ((FP.BvsA$logOR) %>% unlist),
            y = ((FP.BvsA$logFC) %>% unlist),
            x2 = ((TP.BvsA$logOR) %>% unlist),
            y2 = ((TP.BvsA$logFC) %>% unlist), 
            main = "Condition B vs. A", plotSVG = FALSE, filename = paste0(wd,"/scatterhistBA.svg")) #, plotSVG = TRUE, filename = paste0(wd,"/scatterhistBA.svg")

sum(is.na(((FP.BvsA$logFC) %>% unlist)))
## [1] 159
# 159 yeast proteins without a log2FC estimate
sum(is.na(((TP.BvsA$logFC) %>% unlist)))
## [1] 2
# 2 UPS1 proteins without a log2FC estimate

# 2. Middle: condition 6C vs. 6A

TP.CvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6A" & UPS)

FP.CvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6A" & !UPS)

scatterHist(x = ((FP.CvsA$logOR) %>% unlist),
            y = ((FP.CvsA$logFC) %>% unlist),
            x2 = ((TP.CvsA$logOR) %>% unlist),
            y2 = ((TP.CvsA$logFC) %>% unlist), 
            main = "Condition C vs. A", plotSVG = FALSE, filename = paste0(wd,"/scatterhistCA.svg")) #, plotSVG = TRUE, filename = paste0(wd,"/scatterhistCA.svg")

sum(is.na(((FP.CvsA$logFC) %>% unlist)))
## [1] 158
# 158 yeast proteins without a log2FC estimate
sum(is.na(((TP.CvsA$logFC) %>% unlist)))
## [1] 2
# 2 UPS1 proteins without a log2FC estimate

# 3. Bottom: condition 6C vs. 6B

TP.CvsB <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6B" & UPS)

FP.CvsB <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6B" & !UPS)

scatterHist(x = ((FP.CvsB$logOR) %>% unlist),
            y = ((FP.CvsB$logFC) %>% unlist),
            x2 = ((TP.CvsB$logOR) %>% unlist),
            y2 = ((TP.CvsB$logFC) %>% unlist), 
            main = "Condition C vs. B", plotSVG = FALSE, filename = paste0(wd,"/scatterhistCB.svg")) #, plotSVG = TRUE, filename = paste0(wd,"/scatterhistCB.svg")

sum(is.na(((FP.CvsB$logFC) %>% unlist)))
## [1] 159
# 159 yeast proteins without a log2FC estimate
sum(is.na(((TP.CvsB$logFC) %>% unlist)))
## [1] 2
# 2 UPS1 proteins without a log2FC estimate

6. Make the FDR-FTP plots for count-based methods (Supplementary Fig. 8)

# FDR-FTP plots for the count-based methods

res.list <- list(res.CPTAC.full,
                 res.CPTAC.co.full,
                 res.CPTAC.edgeR.full,
                 res.CPTAC.beta.binom.full
)

res.list <- lapply(res.list, function(x){
    left_join(full.Base, x)
}
  )
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
contrast.vec <- c("condition6B-condition6A",
                  "condition6C-condition6A",
                  "condition6C-condition6B")

names(contrast.vec) <- c("Condition 6B vs. 6A",
                  "Condition 6C vs. 6A",
                  "Condition 6C vs. 6B")

colors <- c("#FF681E", "#E15E9E", "lightgreen","yellow3")

sort.list <- list(c("qvalue", "pvalue", "t", "logFC"),
                  c("qvalue", "pvalue", "t", "logOR"),
                  c("FDR", "PValue", "F", "logFC"),
                  c("qvalue", "p.value"))

PlotFDPTPR(res.list, contrast.vec, colors, sort.list, TPcol = c("UPS"), plotSVG = FALSE) # TRUE

7. On the independence of the z-statistics under the null hypothesis (Supplementary Fig. 9)

# Create a mock experimental design where there is no differential abundance in reality
# There is however a huge "batch effect" of "condlab", the combined effects of spike-in condition and lab
pData <- pData(peptides.CPTAC2)
pData$condlab <- paste0(pData$condition,pData$lab)
pData$treat <- rep(c("one","two","three"),9)
pData$treat <- factor(pData$treat, levels = c("one", "two", "three"))

peptides.CPTAC2.noeff <- MSnSet(Biobase::exprs(peptides.CPTAC2), fData = AnnotatedDataFrame(fData(peptides.CPTAC2)), pData = AnnotatedDataFrame(pData))

### Recreate peptides.CPTAC3 ###

# We require by default at least 2 identifications of a peptide sequence, as with 1 identification, the model will be perfectly confounded
sel <- rowSums(!is.na(Biobase::exprs(peptides.CPTAC2))) >= 2
peptides.CPTAC3 <- peptides.CPTAC2[sel]

# Again remove proteins that are only identified by one peptide
sel <- fData(peptides.CPTAC3) %>% group_by(protein) %>% mutate(flag = n() > 1) %>% pull(flag)
peptides.CPTAC3 <- peptides.CPTAC3[sel]

# Drop levels
peptides.CPTAC3 <- MSnbase::MSnSet(exprs = Biobase::exprs(peptides.CPTAC3), fData = droplevels(Biobase::fData(peptides.CPTAC3)), pData = Biobase::pData(peptides.CPTAC3))

dim(peptides.CPTAC2)
## [1] 9377   27
# 9377   27
dim(peptides.CPTAC3)
## [1] 9158   27
# 9158   27
length(unique(fData(peptides.CPTAC2)$protein))
## [1] 1381
# 1381
length(unique(fData(peptides.CPTAC3)$protein))
## [1] 1347
# 1347

rm(sel)
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  6312688 337.2   11660632 622.8         NA 11660632 622.8
## Vcells 23180664 176.9   41143755 314.0      16384 34219248 261.1
######

peptides.CPTAC3.noeff <- MSnSet(Biobase::exprs(peptides.CPTAC3), fData = AnnotatedDataFrame(fData(peptides.CPTAC3)), pData = AnnotatedDataFrame(pData))

# Create contrast matrix for the mock analysis
form <- expression ~ (1|treat) + (1|condlab) + (1|sample) + (1|sequence)
contrasts <- contrast_helper(form, peptides.CPTAC3.noeff, treat)
contrasts <- contrasts[c(1,3,2),c(2,1,3)]
colnames(contrasts)[3] <- "treatthree-treattwo"
contrasts[c(2,3),3] <- c(-1,1)
contrasts
##            treattwo-treatone treatthree-treatone treatthree-treattwo
## treatone                  -1                  -1                   0
## treattwo                   1                   0                  -1
## treatthree                 0                   1                   1
# Fit the models, this might take a few minutes
# res.CPTAC.noeff <- do_mm(formula = form, msnset = peptides.CPTAC3.noeff, group_var = protein, contrasts = contrasts, type_df = "conservative", max_iter = 20)
# Or load the data
load(paste0(wd, "/save_files_mock_CPTAC/res_CPTAC_noeff.RData"))

annotation.CPTAC <- tibble(protein = fData(peptides.CPTAC2)$protein, UPS = grepl("ups",fData(peptides.CPTAC2)$protein), gene.name = fData(peptides.CPTAC2)$gene.name, protein.name = fData(peptides.CPTAC2)$protein.name) %>% distinct
res.CPTAC.Base <- rbind(annotation.CPTAC, annotation.CPTAC, annotation.CPTAC)
res.CPTAC.Base$contrast <- c(rep("treattwo-treatone",nrow(annotation.CPTAC)), rep("treatthree-treatone",nrow(annotation.CPTAC)), rep("treatthree-treattwo",nrow(annotation.CPTAC)))

res.CPTAC.noeff.full <- res.CPTAC.Base %>% left_join(res.CPTAC.noeff$result)
## Joining, by = c("protein", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
res.CPTAC.noeff.full <- res.CPTAC.noeff.full %>% arrange(pvalue)

prot.CPTAC.noeff.co <- do_count_groups(peptides.CPTAC2.noeff, group_var = protein, keep_fData_cols = c("gene.name","protein.name"))

# Remove estimates based on only one sample in one or more conditions from the data

not_one <- which((Biobase::exprs(prot.CPTAC.noeff.co)[,seq(1,25, by=3)] > 0) %>% rowSums < 2)
not_two <- which((Biobase::exprs(prot.CPTAC.noeff.co)[,seq(2,26, by=3)] > 0) %>% rowSums < 2)
not_three <- which((Biobase::exprs(prot.CPTAC.noeff.co)[,seq(3,27, by=3)] > 0) %>% rowSums < 2)

### Important: re-adjust the false discovery rate! ###

res.CPTAC.noeff.full[(res.CPTAC.noeff.full$protein %in% names(not_one)) & (res.CPTAC.noeff.full$contrast %in% c("treattwo-treatone", "treatthree-treatone")),][,c("logFC", "se", "t", "df", "pvalue", "qvalue")] <- NA
res.CPTAC.noeff.full[(res.CPTAC.noeff.full$protein %in% names(not_two)) & (res.CPTAC.noeff.full$contrast %in% c("treattwo-treatone", "treatthree-treattwo")),][,c("logFC", "se", "t", "df", "pvalue", "qvalue")] <- NA
res.CPTAC.noeff.full[(res.CPTAC.noeff.full$protein %in% names(not_three)) & (res.CPTAC.noeff.full$contrast %in% c("treatthree-treatone", "treatthree-treattwo")),][,c("logFC", "se", "t", "df", "pvalue", "qvalue")] <- NA

res.CPTAC.noeff.full <- res.CPTAC.noeff.full %>% group_by(contrast) %>% 
  mutate(qvalue = p.adjust(pvalue, method = "BH"))

# Or load the MSqRob result object via:
# load(paste0(wd, "/save_files_mock_CPTAC/res_CPTAC_noeff_full.RData"))

### Fit the quasi-binomial model ###

form.co <- ~ treat + condlab
# contrasts <- contrast_helper(~ treat, peptides.CPTAC2.noeff, treat)

# res.CPTAC.noeff.co <- do_glm(formula = form.co, msnset = prot.CPTAC.noeff.co, group_var = protein, contrasts = contrasts, contFun = "contEst")
# OR: load the model:
load(paste0(wd, "/save_files_mock_CPTAC/res_CPTAC_noeff_co.RData"))

res.CPTAC.noeff.co.full <- res.CPTAC.Base %>% left_join(res.CPTAC.noeff.co$result)
## Joining, by = c("protein", "contrast")
res.CPTAC.noeff.co.full <- res.CPTAC.noeff.co.full %>% arrange(pvalue)

# cols <-  colorRampPalette(c("#FF3100", "#FCFF00", "#45FE4F",
#                             "#00FEFF", "#000099" 
# ))(256)

# col = cols[ceiling(rank(HEART_A_vsV$pchisq)/length(HEART_A_vsV$pchisq)*256)]

# Order results according to protein and contrast instead of p-value
res.intensities.sorted <- res.CPTAC.noeff.full %>% arrange(contrast, protein)
res.counts.sorted <- res.CPTAC.noeff.co.full %>% arrange(contrast, protein)

all(res.intensities.sorted$protein == res.counts.sorted$protein)
## [1] TRUE
all(res.intensities.sorted$contrast == res.counts.sorted$contrast)
## [1] TRUE
# library(svglite)
# svglite(file = "all_mock_correlations.svg", width = 14, height = 14)
# # svglite(file = "no_correlation_MSqRob_qbinom_21.svg", width = 7, height = 7)
# par(mar=c(5.1,6.1,4.1,2.1))
# par(mfrow = c(2,2))
plot(res.intensities.sorted[res.intensities.sorted$contrast == "treattwo-treatone",]$logFC, res.counts.sorted[res.counts.sorted$contrast == "treattwo-treatone",]$logOR, col = c("black","red")[as.numeric(res.intensities.sorted$UPS)+1], pch = 20, xlab = "MSqRob log2FC", ylab = "Quasi-binomial log2OR", las = 1, frame.plot = FALSE, main = "Nonsense treatment 2 vs. 1", cex.lab = 2, cex.axis = 2, cex.main = 2) #[1:1000]
abline(v = 0, col = "black")
abline(h = 0, col = "black")

# # par(mar=c(5.1,4.1,4.1,2.1))
# # dev.off()
# 
# # library(svglite)
# # svglite(file = "no_correlation_MSqRob_qbinom_31.svg", width = 7, height = 7)
# # par(mar=c(5.1,6.1,4.1,2.1))
plot(res.intensities.sorted[res.intensities.sorted$contrast == "treatthree-treatone",]$logFC, res.counts.sorted[res.counts.sorted$contrast == "treatthree-treatone",]$logOR, col = c("black","red")[as.numeric(res.intensities.sorted$UPS)+1], pch = 20, xlab = "MSqRob log2FC", ylab = "Quasi-binomial log2OR", las = 1, frame.plot = FALSE, main = "Nonsense treatment 3 vs. 1", cex.lab = 2, cex.axis = 2, cex.main = 2) #[1:1000]
abline(v = 0, col = "black")
abline(h = 0, col = "black")

# # par(mar=c(5.1,4.1,4.1,2.1))
# # dev.off()
# 
# # library(svglite)
# # svglite(file = "no_correlation_MSqRob_qbinom_32.svg", width = 7, height = 7)
# # par(mar=c(5.1,6.1,4.1,2.1))
plot(res.intensities.sorted[res.intensities.sorted$contrast == "treatthree-treattwo",]$logFC, res.counts.sorted[res.counts.sorted$contrast == "treatthree-treattwo",]$logOR, col = c("black","red")[as.numeric(res.intensities.sorted$UPS)+1], pch = 20, xlab = "MSqRob log2FC", ylab = "Quasi-binomial log2OR", las = 1, frame.plot = FALSE, main = "Nonsense treatment 3 vs. 2", cex.lab = 2, cex.axis = 2, cex.main = 2) #[1:1000]
abline(v = 0, col = "black")
abline(h = 0, col = "black")

# # par(mar=c(5.1,4.1,4.1,2.1))
# par(mfrow = c(1,1))
# dev.off()

8. For the common set, make the FDR-FTP plots for the methods shown in the main paper (Supplementary Fig. 10)

res.list <- list(res.CPTAC.MSstats.full[!is.infinite(res.CPTAC.MSstats.full$log2FC),],
                 res.CPTAC.MSstats.noimp.full[!is.infinite(res.CPTAC.MSstats.noimp.full$log2FC),],
                 CPTAC.Perseus.imp.full,
                 CPTAC.Perseus.full,
                 res.CPTAC.RegEM.full,
                 res.CPTAC.co.full,
                 res.CPTAC.full,
                 res.CPTAC.Hurdle)

res.list <- lapply(res.list, function(x){
    left_join(common.Base, x)
}
  )
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")Joining, by = c("protein",
## "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")Joining, by = c("protein",
## "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
contrast.vec <- c("condition6B-condition6A",
                  "condition6C-condition6A",
                  "condition6C-condition6B")

names(contrast.vec) <- c("Condition 6B vs. 6A",
                  "Condition 6C vs. 6A",
                  "Condition 6C vs. 6B")

colors <- c("#808000", "green", "#42B7BA", "#1B2944", "black", "#E15E9E", "#FF681E", "#5A2A82") # darkblue: "#1B2944"  gray: "#50FF00"

sort.list <- list(c("adj.pvalue", "pvalue", "Tvalue", "log2FC"),
                  c("adj.pvalue", "pvalue", "Tvalue", "log2FC"),
                  c("q.value", "p.value", "Test.statistic", "Difference"),
                  c("q.value", "p.value", "Test.statistic", "Difference"),
                  c("qvalue", "pvalue", "t", "logFC"),
                  c("qvalue", "pvalue", "t", "logOR"),
                  c("qvalue", "pvalue", "t", "logFC"),
                  c("qchisq", "pchisq", "chisq", NA))

PlotFDPTPR(res.list, contrast.vec, colors, sort.list, TPcol = c("UPS"), plotSVG = FALSE) # TRUE

9. Make the FDR-FTP plots for the other state-of-the-art methods (Supplementary Fig. 11)

res.list <- list(res.CPTAC.ProPCA.full, 
     res.CPTAC.DEP,
     res.CPTAC.mixture,
     res.CPTAC.ProDA.full,
     res.CPTAC.DEP.noimp,
     res.CPTAC.full,
     res.CPTAC.Hurdle)


res.list <- lapply(res.list, function(x){
    left_join(full.Base, x)
}
  )
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Warning: Column `contrast` has different attributes on LHS and RHS of join
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector

## Warning: Column `contrast` has different attributes on LHS and RHS of join
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
contrast.vec <- c("condition6B-condition6A",
                  "condition6C-condition6A",
                  "condition6C-condition6B")

names(contrast.vec) <- c("Condition 6B vs. 6A",
                  "Condition 6C vs. 6A",
                  "Condition 6C vs. 6B")

colors <- c("red2", "peru", "#FFC30B", "magenta", "yellow3", "#FF681E", "#5A2A82")

sort.list <- list(c("adj.P.Val", "P.Value", "t", "logFC"), 
     c("qvalue", "pvalue", "logFC"),
     c("qvalue", "pvalue"),
     c("qval", "pval", "t", "logFC"),
     c("qvalue", "pvalue", "logFC"),
     c("qvalue", "pvalue", "t", "logFC"),
     c("qchisq", "pchisq", "chisq", NA))

PlotFDPTPR(res.list, contrast.vec, colors, sort.list, TPcol = c("UPS"), plotSVG = FALSE) # TRUE

10. For the common set, make the FDR-FTP plots for other state-of-the-art methods (Supplementary Fig. 12)

res.list <- list(res.CPTAC.ProPCA.full, 
     res.CPTAC.DEP,
     res.CPTAC.mixture,
     res.CPTAC.ProDA.full,
     res.CPTAC.DEP.noimp,
     res.CPTAC.full,
     res.CPTAC.Hurdle)

res.list <- lapply(res.list, function(x){
    left_join(common.Base, x)
}
  )
## Joining, by = c("protein", "UPS", "contrast")Joining, by = c("protein",
## "UPS", "contrast")
## Warning: Column `contrast` has different attributes on LHS and RHS of join
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")Joining, by = c("protein",
## "UPS", "contrast")
## Warning: Column `contrast` has different attributes on LHS and RHS of join
## Joining, by = c("protein", "UPS", "contrast")Joining, by = c("protein",
## "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
contrast.vec <- c("condition6B-condition6A",
                  "condition6C-condition6A",
                  "condition6C-condition6B")

names(contrast.vec) <- c("Condition 6B vs. 6A",
                  "Condition 6C vs. 6A",
                  "Condition 6C vs. 6B")

colors <- c("red2", "peru", "#FFC30B", "magenta", "yellow3", "#FF681E", "#5A2A82")

sort.list <- list(c("adj.P.Val", "P.Value", "t", "logFC"), 
     c("qvalue", "pvalue", "logFC"),
     c("qvalue", "pvalue"),
     c("qval", "pval", "t", "logFC"),
     c("qvalue", "pvalue", "logFC"),
     c("qvalue", "pvalue", "t", "logFC"),
     c("qchisq", "pchisq", "chisq", NA))

PlotFDPTPR(res.list, contrast.vec, colors, sort.list, TPcol = c("UPS"), plotSVG = FALSE) # TRUE

11. Plot example proteins for each of the overlapping regions in the Venn diagram (Supplementary Fig. 13 - 17)

Aditionally, we here provide the plots for the underlying (preprocessed) peptide-level data for the 1500 most significant proteins for each method in the HEART dataset, grouped per method.

### First 1500 most significant genes ###

# A. Select top 1500 DA gene identifiers for the Perseus analysis of ventricles vs. atria
genes.Perseus.AvsV.1500 <- (Perseus.AvsV.ov %>% pull(`Gene names`) %>% unique)[1:1500]

# B. Select top 1500 DA gene identifiers for the Hurdle analysis of atria vs. ventricles
genes.Hurdle.AvsV.1500 <- hurdle.AvsV.ov[1:1500,]$gene.name

# C. Select top 1500 DA gene identifiers for the MSqRob analysis of atria vs. ventricles
genes.MSqRob.AvsV.1500 <- MSqRob.AvsV.ov[1:1500,]$gene.name


### Overlap all (Supplementary Fig. 8) ###

plot_proteins(gene.names = "MYL7", title = "MYL7", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "PAM", title = "PAM", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "MYBPHL", title = "MYBPHL", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "MAP1B", title = "MAP1B", msnset = peptides.HEART2, pdf = FALSE)

# plot_proteinsSVG(gene.names = c("MYL7", "PAM", "MYBPHL", "MAP1B"), title = c("overlap_all.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from all three methods
genes <- genes.MSqRob.AvsV.1500[(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500) & (genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character

Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]

Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]

Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]

MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1

Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1

hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1

genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm(Perseus.pvals/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_all.pdf", msnset = peptides.HEART2, pdf = TRUE)


### Overlap Hurdle and Perseus (Supplementary Fig. 9) ###

plot_proteins(gene.names = "CA13", title = "CA13", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "NTN1", title = "NTN1", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "PRKG2", title = "PRKG2", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "SBF2", title = "SBF2", msnset = peptides.HEART2, pdf = FALSE)

# plot_proteinsSVG(gene.names = c("CA13", "NTN1", "PRKG2", "SBF2"), title = c("overlap_hurdle_Perseus.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from Hurdle and Perseus and against MSqRob
genes <- genes.Hurdle.AvsV.1500[(genes.Hurdle.AvsV.1500 %in% genes.Perseus.AvsV.1500) & !(genes.Hurdle.AvsV.1500 %in% genes.MSqRob.AvsV.1500)] %>% as.character

Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]

Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]

Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]

MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1

Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1

hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1

genes.ordered <- qnorm((1-MSqRob.pvals)/2)^2+qnorm(Perseus.pvals/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_hurdle_Perseus.pdf", msnset = peptides.HEART2, pdf = TRUE)


### Hurdle alone (Supplementary Fig. 10) ###

plot_proteins(gene.names = "HTT", title = "HTT", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "ACACA", title = "ACACA", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "ABCA6", title = "ABCA6", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "FTCD", title = "FTCD", msnset = peptides.HEART2, pdf = FALSE)

# plot_proteinsSVG(gene.names = c("HTT", "ACACA", "ABCA6", "FTCD"), title = c("only_hurdle.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from Hurdle and against MSqRob and Perseus
genes <- genes.Hurdle.AvsV.1500[!(genes.Hurdle.AvsV.1500 %in% genes.MSqRob.AvsV.1500) & !(genes.Hurdle.AvsV.1500 %in% genes.Perseus.AvsV.1500)] %>% as.character

Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]

Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]

Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]

MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1

Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1

hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1

genes.ordered <- qnorm((1-MSqRob.pvals)/2)^2+qnorm((1-Perseus.pvals)/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "only_hurdle.pdf", msnset = peptides.HEART2, pdf = TRUE)


### Perseus alone (Supplementary Fig. 11) ###

plot_proteins(gene.names = "GJA5", title = "GJA5", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "ASNSD1", title = "ASNSD1", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "MTCL1", title = "MTCL1", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "BLOC1S5;BLOC1S5-TXNDC5", title = "BLOC1S5;BLOC1S5-TXNDC5", msnset = peptides.HEART2, pdf = FALSE)

# plot_proteinsSVG(gene.names = c("GJA5", "ASNSD1", "MTCL1", "BLOC1S5;BLOC1S5-TXNDC5"), title = c("only_Perseus.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from Perseus and against Hurdle and MSqRob
genes <- genes.Perseus.AvsV.1500[!(genes.Perseus.AvsV.1500 %in% genes.MSqRob.AvsV.1500) & !(genes.Perseus.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character

Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]

Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]

Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]

MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1

Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1

hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1

genes.ordered <- qnorm((1-MSqRob.pvals)/2)^2+qnorm(Perseus.pvals/2)^2+qnorm((1-hurdle.pvals)/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "only_Perseus.pdf", msnset = peptides.HEART2, pdf = TRUE)


### Overlap MSqRob and Perseus (Supplementary Fig. 12) ###

plot_proteins(gene.names = "AK4", title = "AK4", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "NDUFV1", title = "NDUFV1", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "DECR1", title = "DECR1", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "SDHA", title = "SDHA", msnset = peptides.HEART2, pdf = FALSE)

# plot_proteinsSVG(gene.names = c("AK4", "NDUFV1", "DECR1", "SDHA"), title = c("overlap_MSqRob_Perseus.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from MSqRob and Perseus and against Hurdle
genes <- genes.MSqRob.AvsV.1500[(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500) & !(genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character

Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]

Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]

Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]

MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1

Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1

hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1

genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm(Perseus.pvals/2)^2+qnorm((1-hurdle.pvals)/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_MSqRob_Perseus.pdf", msnset = peptides.HEART2, pdf = TRUE)


### MSqRob alone ###

# Order p-values by evidence from MSqRob and against Hurdle and Perseus
genes <- genes.MSqRob.AvsV.1500[!(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500) & !(genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character

Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]

Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]

Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]

MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1

Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1

hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1

genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm((1-Perseus.pvals)/2)^2+qnorm((1-hurdle.pvals)/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "only_MSqRob.pdf", msnset = peptides.HEART2, pdf = TRUE)


### Overlap MSqRob and Hurdle ###

# Order p-values by evidence from MSqRob and Hurdle and against Perseus
genes <- genes.MSqRob.AvsV.1500[(genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500) & !(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500)] %>% as.character

Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]

Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]

Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]

MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1

Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1

hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1

genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm((1-Perseus.pvals)/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_MSqRob_Hurdle.pdf", msnset = peptides.HEART2, pdf = TRUE)

12. Generate the data for the other Venn diagrams for the HEART dataset (Supplementary Fig. 18)

### Venn diagrams atrial vs. ventricular region ###

### Checks: ###

all(Perseus.AvsV.ov$`Gene names` %in% hurdle.AvsV.ov$gene.name)
## [1] TRUE
all(Perseus.AvsV.ov$`Gene names` %in% MSqRob.AvsV.ov$gene.name)
## [1] TRUE
all(hurdle.AvsV.ov$gene.name %in% Perseus.AvsV.ov$`Gene names`)
## [1] TRUE
all(MSqRob.AvsV.ov$gene.name %in% Perseus.AvsV.ov$`Gene names`)
## [1] TRUE
dim(MSqRob.AvsV.ov)
## [1] 7822    9
dim(hurdle.AvsV.ov)
## [1] 7822    8
length(unique(Perseus.AvsV.ov$`Gene names`))
## [1] 7822
################

### Top: the first 500 significant genes ###

# A. Select top 500 DA gene identifiers for the Perseus analysis of ventricles vs. atria
genes.Perseus.AvsV.500 <- (Perseus.AvsV.ov %>% pull(`Gene names`) %>% unique)[1:500]

# B. Select top 500 DA gene identifiers for the Hurdle analysis of atria vs. ventricles
genes.Hurdle.AvsV.500 <- hurdle.AvsV.ov[1:500,]$gene.name

# C. Select top 500 DA gene identifiers for the MSqRob analysis of atria vs. ventricles
genes.MSqRob.AvsV.500 <- MSqRob.AvsV.ov[1:500,]$gene.name

# MSqRob alone

sum(!(genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500) & !(genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500))
## [1] 99
# 99

# Hurdle alone

sum(!(genes.Hurdle.AvsV.500 %in% genes.MSqRob.AvsV.500) & !(genes.Hurdle.AvsV.500 %in% genes.Perseus.AvsV.500))
## [1] 85
# 85

# Perseus alone

sum(!(genes.Perseus.AvsV.500 %in% genes.MSqRob.AvsV.500) & !(genes.Perseus.AvsV.500 %in% genes.Hurdle.AvsV.500))
## [1] 218
# 218

# Overlap MSqRob and Hurdle

sum((genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500) & !(genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500))
## [1] 158
# 158

# Overlap MSqRob and Perseus

sum((genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500) & !(genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500))
## [1] 25
# 25

# Overlap Hurdle and Perseus

sum((genes.Hurdle.AvsV.500 %in% genes.Perseus.AvsV.500) & !(genes.Hurdle.AvsV.500 %in% genes.MSqRob.AvsV.500))
## [1] 39
# 39

# Overlap everything

sum((genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500) & (genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500))
## [1] 218
# 218

# Control:
length(genes.MSqRob.AvsV.500)
## [1] 500
99+218+158+25
## [1] 500
length(genes.Hurdle.AvsV.500)
## [1] 500
85+158+218+39
## [1] 500
length(genes.Perseus.AvsV.500)
## [1] 500
218+39+25+218
## [1] 500
### Bottom: the first 1000 significant genes ###

# A. Select top 1000 DA gene identifiers for the Perseus analysis of ventricles vs. atria
genes.Perseus.AvsV.1000 <- (Perseus.AvsV.ov %>% pull(`Gene names`) %>% unique)[1:1000]

# B. Select top 1000 DA gene identifiers for the Hurdle analysis of atria vs. ventricles
genes.Hurdle.AvsV.1000 <- hurdle.AvsV.ov[1:1000,]$gene.name

# C. Select top 1000 DA gene identifiers for the MSqRob analysis of atria vs. ventricles
genes.MSqRob.AvsV.1000 <- MSqRob.AvsV.ov[1:1000,]$gene.name

# MSqRob alone

sum(!(genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000) & !(genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000))
## [1] 174
# 174

# Hurdle alone

sum(!(genes.Hurdle.AvsV.1000 %in% genes.MSqRob.AvsV.1000) & !(genes.Hurdle.AvsV.1000 %in% genes.Perseus.AvsV.1000))
## [1] 134
# 134

# Perseus alone

sum(!(genes.Perseus.AvsV.1000 %in% genes.MSqRob.AvsV.1000) & !(genes.Perseus.AvsV.1000 %in% genes.Hurdle.AvsV.1000))
## [1] 439
# 439

# Overlap MSqRob and Hurdle

sum((genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000) & !(genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000))
## [1] 363
# 363

# Overlap MSqRob and Perseus

sum((genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000) & !(genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000))
## [1] 58
# 58

# Overlap Hurdle and Perseus

sum((genes.Hurdle.AvsV.1000 %in% genes.Perseus.AvsV.1000) & !(genes.Hurdle.AvsV.1000 %in% genes.MSqRob.AvsV.1000))
## [1] 98
# 98

# Overlap everything

sum((genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000) & (genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000))
## [1] 405
# 405

# Control:
length(genes.MSqRob.AvsV.1000)
## [1] 1000
174+363+58+405
## [1] 1000
length(genes.Hurdle.AvsV.1000)
## [1] 1000
134+363+98+405
## [1] 1000
length(genes.Perseus.AvsV.1000)
## [1] 1000
439+58+98+405
## [1] 1000